In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np

%matplotlib inline

In [67]:
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='iso-8859-1', sep=';')
ratings.columns = ['user_id', 'isbn', 'book_rating']
books = pd.read_csv('BX-Books.csv', sep=';', encoding='iso-8859-1', dtype=str)

# The same title can appear under several ISBNs (different editions),
# so map every title to a single canonical ISBN.
books["Book-Title"].nunique() == books["ISBN"].nunique()
book_dict = books[["Book-Title", "ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
books["Book-Title"].nunique() == books["new_isbn"].nunique()
books['isbn'] = books['new_isbn']

# Drop columns that are not needed downstream.
del books['Image-URL-L']
del books['Image-URL-M']
del books['Image-URL-S']
del books['Book-Author']
del books['Publisher']
del books['ISBN']
del books['new_isbn']

# Keep only explicit ratings (a rating of 0 marks an implicit interaction).
newdf = ratings[ratings.book_rating > 0]
joined = books.merge(newdf, on='isbn')
print(newdf.shape)


(433671, 3)

In [92]:
bookinfo = pd.read_csv("goodreads_list_props.csv")
bookinfo2 = pd.read_csv("goodreads_list_props1.csv")

In [93]:
import pickle
bookinfo3 = pd.read_pickle("ibsn_features_full.pickle")

In [94]:
bookinfo.columns


Out[94]:
Index(['book_name', 'author', 'rating', 'votes', 'description', 'book_type',
       'no_of_pages', 'first_published', 'isbn13', 'genre', 'link'],
      dtype='object')

In [95]:
bookinfo2.columns


Out[95]:
Index(['book_name', 'author', 'rating', ' votes', ' description', 'book_type',
       'no_of_pages', 'first_published', 'isbn13', 'genre', 'link'],
      dtype='object')

In [96]:
bookinfo3.columns


Out[96]:
Index(['isbn', 'description', 'num_pages', 'title'], dtype='object')

In [97]:
# Align bookinfo3's column names with the other Goodreads frames.
bookinfo3.columns = ['isbn13', 'description', 'no_of_pages', 'book_name']

In [98]:
# bookinfo2 has the same fields but with stray leading spaces in some
# headers (' votes', ' description'), so reuse bookinfo's column names.
bookinfo2.columns = bookinfo.columns
bookinfo = pd.concat([bookinfo, bookinfo2])
bookinfo = bookinfo[['isbn13', 'description', 'no_of_pages', 'book_name']]
bookinfo = pd.concat([bookinfo, bookinfo3])
bookinfo.drop_duplicates(inplace=True)

In [99]:
books.drop_duplicates(subset='isbn', inplace=True)

In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys


def is_isbn10_valid(isbn):
    """
    Check ISBN-10 is valid.
    Code implementation from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 10:
        return False
    if ((not isbn[0:9].isdigit()) or
            ((isbn[-1] != 'X') and (not isbn[-1].isdigit()))):
        return False
    result = sum((10 - i) * (int(x) if x != 'X' else 10)
                 for i, x in enumerate(isbn))
    return result % 11 == 0


def is_isbn13_valid(isbn):
    """
    Check ISBN-13 is valid.
    Code implementation from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 13 or not isbn.isdigit():
        return False
    check = (10 - (sum(int(digit) * (3 if idx % 2 else 1)
                       for idx, digit in enumerate(isbn[:12])) % 10)) % 10
    return check == int(isbn[-1])


def isbn13_to_isbn10(isbn13_str):
    """
    Convert ISBN-13 to ISBN-10.
    """
    num = 11 - (sum((10 - i) * int(x)
                    for i, x in enumerate(isbn13_str[3:12])) % 11)
    if num == 10:
        check_digit = 'X'
    elif num == 11:
        check_digit = 0
    else:
        check_digit = num
    return isbn13_str[3:12] + str(check_digit)


def isbn10_to_isbn13(isbn10_str):
    """
    Convert ISBN-10 to ISBN-13.
    """
    check_digit = (
        10 - (sum(int(digit) * (3 if idx % 2 else 1)
                  for idx, digit in enumerate('978' + isbn10_str[:9])
                  ) % 10)) % 10
    return '978' + isbn10_str[:9] + str(check_digit)


def isbn_converter(isbn):
    """
    Convert isbn format to another format.
    """
    if is_isbn10_valid(isbn):
        result = isbn10_to_isbn13(isbn)
    elif is_isbn13_valid(isbn):
        result = isbn13_to_isbn10(isbn)
    else:
        return None
    return result


if __name__ == "__main__":
    for isbn_str in sys.argv[1:]:
        the_result = isbn_converter(isbn_str)
        if the_result:
            print(the_result)
        else:
            print("Bad ISBN " + isbn_str)


Bad ISBN -f
Bad ISBN C:\Users\vijay\AppData\Roaming\jupyter\runtime\kernel-54701759-7a9b-40b3-aed1-ecb74bfa38c3.json
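
A quick sanity check of the converter on a known-valid ISBN-10, 0-306-40615-2 (expected results shown as comments):

isbn_converter('0306406152')     # '9780306406157'
isbn_converter('9780306406157')  # '0306406152'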

In [19]:
# Convert the BX ISBN-10s to ISBN-13 so they can be joined with the Goodreads metadata.
isbn13 = []
for i in books['isbn']:
    isbn13.append(isbn_converter(i))

In [25]:
books['isbn13'] = isbn13

In [26]:
books.dropna(subset=['isbn13'], inplace=True)
bookinfo.dropna(subset=['isbn13'], inplace=True)

In [27]:
mergedinfo = bookinfo.merge(books, on='isbn13', how='inner')

In [28]:
import re
def striphtml(data):
    """Remove HTML tags from a description; return None for missing values."""
    p = re.compile('<.*?>')
    try:
        return p.sub('', data)
    except TypeError:  # non-string (e.g. NaN) description
        return None

In [29]:
mergedinfo['description'] = mergedinfo['description'].apply(striphtml)
mergedinfo['description'] = mergedinfo['description'].str.strip()
mergedinfo['description'] = mergedinfo['description'].str.replace('“', '').str.replace(',', '').str.replace('"', '')
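
Note that only the left curly quote is stripped above; a hedged one-liner that also removes the right curly quote in a single pass (assuming a pandas version whose str.replace accepts regex=True):

mergedinfo['description'] = mergedinfo['description'].str.replace(r'[“”",]', '', regex=True)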

In [30]:
from nltk.corpus import stopwords
# ...
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in mergedinfo['description']:
    try:
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    except AttributeError:  # missing (NaN) description
        filtereddesc.append(None)

In [31]:
mergedinfo['filtered_description'] = filtereddesc

In [32]:
wordlist = []
for descs in mergedinfo['filtered_description']:
    sentence = []
    if descs is not None:
        for word in descs:
            sentence.append(word)
    wordlist.append(sentence)

Download Google's pre-trained word2vec model (GoogleNews-vectors-negative300.bin) before running the next cell.


In [33]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
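
If the GoogleNews binary is not already on disk, a hedged alternative (assuming internet access and a gensim release that ships gensim.downloader) is to fetch the same vectors through gensim's downloader:

import gensim.downloader as api
model = api.load('word2vec-google-news-300')  # downloads the vectors on first use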

In [34]:
genres = ['Science', 'Satire', 'Drama', 'Action', 'Romance', 'Mystery', 'Horror', 'Travel',
          'Children', 'Religion', 'History', 'Biography', 'Autobiography', 'Fantasy']

In [35]:
# For every description, score each genre as the mean word2vec similarity
# between the description's words and the genre label (OOV words are skipped).
scores = []
for desc in mergedinfo['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:  # word not in the word2vec vocabulary
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)
        scores.append(gscore)
    else:
        scores.append(None)
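
To illustrate the shape of each entry in scores, here is the same averaging applied to a hypothetical three-word description (the numbers depend on the loaded model):

toy_desc = ['wizard', 'dragon', 'kingdom']  # hypothetical filtered description
toy_scores = []
for genre in genres:
    sims = []
    for word in toy_desc:
        try:
            sims.append(model.similarity(word, genre))
        except KeyError:  # skip out-of-vocabulary words
            continue
    toy_scores.append(sum(sims) / len(sims) if sims else 0)
# toy_scores is a list of 14 floats, one per entry in genres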

In [36]:
# Books with no usable description get a zero vector (one entry per genre).
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * 14)

In [37]:
scoredf = pd.DataFrame(editedscores, columns=[genre + '_Score' for genre in genres])

In [38]:
bookfeatures = pd.concat([mergedinfo, scoredf], axis=1)

AMAZON DATASET


In [53]:
newbooks = pd.read_csv("Combine.csv")
newbooksisbn = newbooks['isbn']
newbooksisbn13 = []

for i in newbooksisbn:
    newbooksisbn13.append(isbn_converter(i))

newbooksuniqueisbn13 = list(set(newbooksisbn13))
amazonbookfeatures = bookinfo[bookinfo['isbn13'].isin(newbooksuniqueisbn13)]

In [44]:
amazonbookfeatures['description'] = amazonbookfeatures['description'].apply(striphtml)
amazonbookfeatures['description'] = amazonbookfeatures['description'].str.strip()
amazonbookfeatures['description'] = amazonbookfeatures['description'].str.replace('“', '').str.replace(',', '').str.replace('"', '')


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
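
The SettingWithCopyWarning above appears because amazonbookfeatures is a slice of bookinfo; a hedged way to avoid it is to take an explicit copy before assigning new columns:

amazonbookfeatures = bookinfo[bookinfo['isbn13'].isin(newbooksuniqueisbn13)].copy()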

In [45]:
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in amazonbookfeatures['description']:
    try:
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    except AttributeError:  # missing (NaN) description
        filtereddesc.append(None)

In [46]:
amazonbookfeatures['filtered_description'] = filtereddesc
wordlist = []
for descs in amazonbookfeatures['filtered_description']:
    sentence = []
    if descs is not None:
        for word in descs:
            sentence.append(word)
    wordlist.append(sentence)


C:\Users\vijay\Anaconda2\envs\py35\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [47]:
# Same genre scoring as above, applied to the Amazon subset.
scores = []
for desc in amazonbookfeatures['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:  # word not in the word2vec vocabulary
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)
        scores.append(gscore)
    else:
        scores.append(None)

In [48]:
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * 14)

In [49]:
scoredf = pd.DataFrame(editedscores, columns=[genre + '_Score' for genre in genres])

In [61]:
amzbookfeatures = pd.concat([amazonbookfeatures.reset_index(drop=True), scoredf], axis=1)